import os
import re
import pandas as pd

# Matches three consecutive non-empty, underscore-separated segments and
# captures the third one (e.g. 'a_b_c' -> 'c').  Used with .search(), so the
# leftmost such run in the text is the one that matches.
reg_pet = re.compile(r'[^_]+_[^_]+_([^_]+)')
# Input columns holding the two label lists that get compared
# (literally "level-3 label" and "level-3 label 2").
COL3 = '三级标签'
COL3_2 = '三级标签2'
# Output column: '是' / '否' ("yes" / "no") depending on whether both label sets are equal
# (literally "are the labels consistent").
COL_IS_SAME = '标签是否一致'
# Output column: labels found in COL3 but not in COL3_2 (literally "missing from level-3 label 2").
COL_2_LESS = '三级标签2缺少'
# Output column: labels found in COL3_2 but not in COL3 (literally "extra in level-3 label 2").
COL_2_MORE = '三级标签2多出'
# Directory that the script creates and uses as the parent of download targets.
DOWNLOAD_FOLDER = 'results'


def extract_pets_from_part(part):
    """Return the label carried by a single comma-free fragment, as a set.

    If ``reg_pet`` matches, the result holds only the captured third
    underscore-separated segment.  Otherwise the fragment itself is the
    label, except for the literal '其他' ("other"), which yields an empty set.
    """
    found = reg_pet.search(part)
    if found is not None:
        return {found.group(1)}
    if part == '其他':
        return set()
    return {part}


# Memo of cell text -> extracted label set.  Each generate_column_* helper
# parses both label cells of a row, so caching avoids re-parsing the same text.
pets_cache = {}


def extract_pets_from_cell(cell_value):
    """Return the set of labels contained in one comma-separated cell.

    The cell is split on ',' and every fragment is passed through
    ``extract_pets_from_part``; the union of the results is returned.

    Args:
        cell_value: Raw cell content.  Non-string values (e.g. numbers read
            from Excel) are converted with ``str`` before splitting.

    Returns:
        A ``set`` of label strings.  The same set object is stored in
        ``pets_cache`` and handed back on later calls, so callers must treat
        it as read-only.
    """
    text = str(cell_value)
    if text in pets_cache:
        return pets_cache[text]
    all_pets = set()
    for raw_part in text.split(','):
        # Strip stray whitespace around commas and skip blank fragments so an
        # empty cell or a trailing comma does not produce a phantom '' label.
        part = raw_part.strip()
        if part:
            all_pets |= extract_pets_from_part(part)
    pets_cache[text] = all_pets
    return all_pets


def generate_column_is_same(row):
    """Return '是' when both label cells of ``row`` yield equal label sets, else '否'."""
    labels_first = extract_pets_from_cell(row[COL3])
    labels_second = extract_pets_from_cell(row[COL3_2])
    if labels_first == labels_second:
        return '是'
    return '否'


def generate_column_less(row):
    """Return the labels present in COL3 but absent from COL3_2, comma-joined.

    The labels are sorted before joining: set iteration order for strings
    varies between interpreter runs, which would otherwise make the output
    file differ from run to run for identical input.
    """
    a, b = extract_pets_from_cell(row[COL3]), extract_pets_from_cell(row[COL3_2])
    return ','.join(sorted(a - b))


def generate_column_more(row):
    """Return the labels present in COL3_2 but absent from COL3, comma-joined.

    The labels are sorted before joining: set iteration order for strings
    varies between interpreter runs, which would otherwise make the output
    file differ from run to run for identical input.
    """
    a, b = extract_pets_from_cell(row[COL3]), extract_pets_from_cell(row[COL3_2])
    return ','.join(sorted(b - a))


def download(url, full_name):
    """Placeholder downloader: only prints what would be fetched and where.

    The printed text tells the user (in Chinese) that the actual download
    from ``url`` to ``full_name`` still has to be implemented by them.
    """
    notice = f'(需要你自己实现) 下载 {url} 到 {full_name} '
    print(notice)


if __name__ == "__main__":
    # Script entry point: read the sheet, add the three comparison columns,
    # write the result, then request a download for every mismatched row.
    input_file = r'原始数据.xlsx'
    output_file = r'效果1.xlsx'
    df = pd.read_excel(input_file)
    # Replace missing values with '' so the label helpers receive text.
    df = df.fillna('')
    df[COL_IS_SAME] = df.apply(generate_column_is_same, axis='columns')
    df[COL_2_LESS] = df.apply(generate_column_less, axis='columns')
    df[COL_2_MORE] = df.apply(generate_column_more, axis='columns')
    df.to_excel(output_file, index=False,
                # Follow this pattern to add any other columns you want written out.
                columns=[COL3, COL3_2, COL_IS_SAME, COL_2_LESS, COL_2_MORE])

    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
    for _, row in df.iterrows():
        if row[COL_IS_SAME] == '否':
            # os.path.join only accepts str/bytes/path-like components, and an
            # ID column holding numbers is read from Excel as numeric values.
            full_name = os.path.join(DOWNLOAD_FOLDER, str(row['内容ID']))
            download(row['视频链接'], full_name)
